Supplementary Data File 3 -

This is the THIRD script used to generate the main figures for the the manuscript by Proctor et al. titled “A spatial gradient of bacterial diversity in the human oral cavity shaped by salivary flow”

This script and associated data are provided via (c) by Diana M Proctor, Julia A. Fukuyama, Susan P. Holmes, David A. Relman. These data and the associated script are licensed under the Creative Commons Attribution-ShareAlike 4.0 International License (CC-BY-CA).

Given attribution, you are free to: 1) Share, copy and redistribute the material in any medium or format 2) Adapt, remix, transform, and build upon the material for any purpose, even commercially.

To see the full license associated with attribution of this work, see the CC-By-CA license, see http://creativecommons.org/licenses/by-sa/4.0/.

this script is used to generate the main figures for the manuscript

Run the decontamination script (Supplementary Data File 1) first. Then run this one. Most of the supplemental figures are in an accompanying files (Supplementary Data Files 3-4). The local name of this file on my desktop is: Proctor_MainFigures_v28.0.Rmd.

Import the data (from the decontamination script) and create a supragingival dataset

## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
## 
##     sampleNames
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 7002 samples ]
## sample_data() Sample Data:       [ 7002 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 7 taxonomic ranks ]

0a. Description of data: Load in the discovery dataset and get summary stats for the dataset

load("~/Desktop/Proctor_NatureComm/mouth.RData")
discovery = mouthPhy

#how many samples and taxa
discovery
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 724 taxa and 1905 samples ]
## sample_data() Sample Data:       [ 1905 samples by 27 sample variables ]
## tax_table()   Taxonomy Table:    [ 724 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 724 tips and 723 internal nodes ]
#what is the sequencing depth?
summary(sample_sums(discovery))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1539    2258    2259    2931   11181
#how many subjects are in this dataset?
length(levels(as.factor(sample_data(discovery)$Subject)))
## [1] 11

0b. Description of data: Load in the validation dataset and get summary stats for the dataset

#how many samples are in the validation dataset?
nsamples(supra)
## [1] 7002
#how many subjects are in the validation dataset?
length(levels(sample_data(supra)$Subject))
## [1] 19
#how many healthy aim 1 subjects are in the validation dataset?
aim1 = subset_samples(supra, Aim=="SA1" | Aim=="Pilot")
length(levels(sample_data(aim1)$Subject))
## [1] 9
#how many healthy aim 3 subjects are in the validation dataset?
aim3 = subset_samples(supra, Aim=="SA3")
length(levels(sample_data(aim3)$Subject))
## [1] 10
#what is the median sequencing depth of the validation dataset?
summary(sample_sums(supra))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1   61920   73772   70563   84410  213919

0.c. Description of data: Load in the mucosal biogeography dataset and get the sequencing statistics

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.4-4
load("~/Desktop/Proctor_NatureComm/Biogeo1_phys.Rdata")
#fix the sample names in the Biogeo object
names <- sample_names(Biogeo1_phys)
names <- str_replace_all(names, ("aaa"), ".")
names <- str_replace_all(names, ("zzz"), ".")
sample_names(Biogeo1_phys) <- names
#use version 5 of the mapping file, includes coordinates
map <- import_qiime(mapfilename="~/Desktop/Proctor_NatureComm/biogeo_mapping_file_corrected_v5.0.txt")
## Processing map file...
sample_data(Biogeo1_phys) <- map

sites = c("AG", "BM", "KG", "Supra")
#subset on the sites analyzed in this study
Biogeo2_phys <- subset_samples(Biogeo1_phys, Site.New %in% sites)
sample_data(Biogeo2_phys)$TissueClass = ifelse(sample_data(Biogeo2_phys)$Site.New =="Supra", "Non-shedding            ", "Shedding               ")

#note that alveolar mucosa and keratinized gingiva sites are mislabeled; switch them
### Fix the other sample site labels
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(AG)", "Keratinized Gingiva   ") 
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(BM)", "Buccal Mucosa   ")
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(KG)", "Alveolar Mucosa   ")
sample_data(Biogeo2_phys)$Site.New = str_replace_all(sample_data(Biogeo2_phys)$Site.New, "(Supra)", "Supragingival Tooth   ")

#how many samples are in the mucosal biogeo dataset?
nsamples(Biogeo2_phys)
## [1] 542
#library(dada2)
#sequence table from rdp classifier on rdp database
#tax = tax_table(Biogeo1_phys)
#set.seed(100) # Initialize random number generator for reproducibility
#taxa <- assignTaxonomy(rownames(tax), "~/Dropbox/rdp_train_set_14.fa.gz", minBoot=80)
#taxaSpecies <- addSpecies(taxa, "~/Dropbox/rdp_species_assignment_14.fa.gz", verbose=TRUE)
#tax_table(Biogeo1_phys) <- taxaSpecies


#blast some of the sequences and create a new variable column for the highest taxonomic rank
#write.csv(data.frame(tax_table(Biogeo1_phys), taxa_sums(Biogeo1_phys)), file="biogeo_species_v1.0.csv")

#read in the new taxonomic table
tax2 = read.csv("~/Desktop/Proctor_NatureComm/biogeo_species_v3.0.csv", header=TRUE, row.names=1)
tax3 = tax_table(tax2)
## Warning in .local(object): Coercing from data.frame class to character matrix 
## prior to building taxonomyTable. 
## This could introduce artifacts. 
## Check your taxonomyTable, or coerce to matrix manually.
taxa_names(tax3) = rownames(tax2)
colnames(tax3) = colnames(tax2)
tax_table(Biogeo2_phys) = tax3

#relabel with the highest rank designation
taxa_names(Biogeo2_phys) = tax_table(Biogeo2_phys)[,8]

#filter to retain taxa present in 15% of samples
Biogeo2_phys15 = filter_taxa(Biogeo2_phys, function(x) sum(x > 10) > (0.15 * length(x)),  TRUE)
Biogeo2_phys15 = prune_taxa(taxa_sums(Biogeo2_phys15) > 0, Biogeo2_phys15)
Biogeo2_phys15 = prune_samples(sample_sums(Biogeo2_phys15) > 0, Biogeo2_phys15)
Biogeo2_phys15 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 101 taxa and 542 samples ]
## sample_data() Sample Data:       [ 542 samples by 27 sample variables ]
## tax_table()   Taxonomy Table:    [ 101 taxa by 12 taxonomic ranks ]
### Set up several otu tables with different transformations
#raw data

#vst-transformed data
### Stabilize the variance
#create a deseq object
Biogeo2_VST <- Biogeo2_phys15
otu_table(Biogeo2_VST) <- otu_table(Biogeo2_VST) + 1
Biogeo2.ds = phyloseq_to_deseq2(Biogeo2_VST, ~Subject + Site.New)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#variance stabilizing transformation
Biogeo2_phys15_VST <- varianceStabilizingTransformation(Biogeo2.ds , blind=TRUE, fitType="local") 
biogeo_counts_VST <- otu_table(as.matrix(assay(Biogeo2_phys15_VST)), taxa_are_rows=TRUE)
otu_table(Biogeo2_VST) <- biogeo_counts_VST 
otus.VST = data.frame(t(otu_table(Biogeo2_VST)))

#how many samples are in the mucosal biogeo dataset?
nsamples(Biogeo2_phys)
## [1] 542
#how many supra samples and mucosal samples are there
sb = data.frame(sample_data(Biogeo2_phys))
table(sb$Site.New)
## 
##     Alveolar Mucosa          Buccal Mucosa    Keratinized Gingiva    
##                    126                     82                    166 
## Supragingival Tooth    
##                    168
#how many mucosal samples are there
sb = subset(sb, Site.New != "Supragingival Tooth   ")
sum(table(sb$Site.New))
## [1] 374
#how many subjects / samples per subject are in the mucosal biogeo dataset?
table(sb$Subject)
## 
## 4-101 4-102 4-103 
##   124   124   126
#what is the median sequencing depth of the mucosal biogeo dataset?
summary(sample_sums(Biogeo2_phys))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1   70991   82354   77684   91721  123925

0.d how many samples are in the combined data?

#how many taxa are in the decontaminated validation data
supra
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 7002 samples ]
## sample_data() Sample Data:       [ 7002 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
nsamples(discovery)+nsamples(Biogeo2_phys)+nsamples(supra)
## [1] 9449

0.e. Description of data: Phyla level composition survey

#What is the relative abundance of each Phylum?
library(doBy)
supra.phy <- tax_glom(supra, taxrank="Phylum")
tax.count <- data.frame(tax_table(supra.phy)[,2:9], taxa_sums(supra.phy))
colnames(tax.count) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species", "Highest",  "Abundance")
Phylum_df <- summaryBy(Abundance~Phylum, data=tax.count, FUN=sum)
Phylum_df $Percent <- round(Phylum_df $Abundance.sum/sum(Phylum_df $Abundance)*100, 4)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename
Phylum_df <- arrange(Phylum_df, desc(Percent))
Phylum_df
##                         Phylum Abundance.sum Percent
## 1                   Firmicutes     196601153 39.8095
## 2               Proteobacteria     139356937 28.2182
## 3               Actinobacteria     108027071 21.8742
## 4                Bacteroidetes      34477486  6.9813
## 5                 Fusobacteria      14858582  3.0087
## 6                          SR1        340398  0.0689
## 7                 Spirochaetes        121010  0.0245
## 8    Cyanobacteria/Chloroplast         52172  0.0106
## 9  Candidatus_Saccharibacteria         10815  0.0022
## 10                   Aquificae          3559  0.0007
## 11               Acidobacteria          2799  0.0006
## 12         Deinococcus-Thermus          1701  0.0003
## 13                 Tenericutes          1296  0.0003
write.csv(Phylum_df, file="~/Desktop/TableS1.csv")
#how much do the top 5 phyla contribute to total abundance?
top5 <- Phylum_df[1:5, ]
round(sum(top5$Percent),2)
## [1] 99.89
#What are the top 5 Phyla?
top5
##           Phylum Abundance.sum Percent
## 1     Firmicutes     196601153 39.8095
## 2 Proteobacteria     139356937 28.2182
## 3 Actinobacteria     108027071 21.8742
## 4  Bacteroidetes      34477486  6.9813
## 5   Fusobacteria      14858582  3.0087

0.f. Description of data: genus-level summary of data

#What is the number of Genera found?
length(get_taxa_unique(supra, taxonomic.rank="Genus"))
## [1] 118
#what are the abundance levels of each genus?
supra.genus <- tax_glom(supra, taxrank="Genus")
tax.count <- data.frame(tax_table(supra.genus)[,2:9], taxa_sums(supra.genus))
colnames(tax.count) <- c("Kingdom", "Phylum","Class","Order","Family","Genus", "Species", "Highest",  "Abundance")
Genus_df<- summaryBy(Abundance~Genus, data=tax.count, FUN=sum)
Genus_df$Percent <- round(Genus_df$Abundance.sum/sum(Genus_df$Abundance)*100, 4)
library(plyr)
Genus_df <- arrange(Genus_df, desc(Percent))
Genus_df <- Genus_df[order(Genus_df$Percent, decreasing=TRUE), ] 

#how much do the top 10 genera contribute to total abundance?
top10 <- Genus_df[1:10, ]
round(sum(top10$Percent),3)
## [1] 88.198
#what about the top 5
top5 <- Genus_df[1:5, ]
round(sum(top5$Percent),3)
## [1] 72.43
###How diverse are the top 10 genera? i.e., how many species are there per genus?
top10 <- as.vector(Genus_df$Genus[1:10])
Diversity.list <- vector("list", 10)
names(Diversity.list) <- top10

for(i in 1:length(top10)){
       physub = subset_taxa(supra, Genus==top10[i])
       physub = prune_taxa(taxa_sums(physub) > 0, physub)
       Diversity.list[[i]] <- physub
}

#compute the number of taxa in each element of the list
Ntaxa <- data.frame(unlist(lapply(Diversity.list, ntaxa)))

colnames(Ntaxa) <- "N.Species"
#Make a table with percent abundance and number of taxa
genus.tab <- data.frame(Genus_df[1:10,], Ntaxa)
library(knitr)
kable(genus.tab, format="markdown")
Genus Abundance.sum Percent N.Species
Streptococcus 143337443 31.6955 11
Haemophilus 64073059 14.1682 6
Rothia 61438005 13.5855 6
Neisseria 32952227 7.2866 10
Actinomyces 25749541 5.6939 17
Corynebacterium 19371708 4.2836 4
Veillonella 17758620 3.9269 10
Abiotrophia 11542724 2.5524 1
Gemella 11334234 2.5063 2
Prevotella 11301369 2.4990 46
write.csv(genus.tab, file="~/Desktop/TableS2.csv")

1a. Perform PCoA on Bray Curtis and use an adonis to partitiion variation in to time, tooth class, tooth aspect, and subject

library(RColorBrewer)
### Create an index tooth biogeography subset
index <- subset_samples(supra, Protocol=="Home")
sites <- c("Molar", "Incisor_Central")
index_subsites <- subset_samples(index, Tooth_Class %in% sites)
index_subsites
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 3400 samples ]
## sample_data() Sample Data:       [ 3400 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
#how many samples are in average 2 subjects? Looks like about 25% of samples
mean(table(sample_data(index_subsites)$Subject))
## [1] 425
#filter the taxa to include those present in 25% of samples
filtergroup = filterfun(kOverA(k=850, A=1)) #k = number of samples; A = abundance
        index25 = filter_taxa(index_subsites, filtergroup, prune=TRUE) 
        index25 = prune_samples(sample_sums(index25) > 0, index25) 

        #note the following command drops 7 samples (depth between 1-35)
        index25 = prune_samples(sample_sums(index25) > 800, index25) 
        index25
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 90 taxa and 3393 samples ]
## sample_data() Sample Data:       [ 3393 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 90 taxa by 10 taxonomic ranks ]
#fix the labeling of the subject ids 
#relabel for prettier plotting
sample_data(index25)$Tooth_Class <- str_replace_all(sample_data(index25)$Tooth_Class, "(Incisor_Central)", "Incisor")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-101)", "Subject 1")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-102)", "Subject 2")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-104)", "Subject 3")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-105)", "Subject 4")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-106)", "Subject 5")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(1-107)", "Subject 6")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(P1-8)", "Subject 7")
sample_data(index25)$Subject <- str_replace_all(sample_data(index25)$Subject, "(P1-9)", "Subject 8")

#force the labeling of the subjects
ordering= c("Subject 1", "Subject 2",  "Subject 3" , "Subject 4" , "Subject 5" ,"Subject 6", "Subject 7" ,"Subject 8")
sample_data(index25)$Subject  <- factor(sample_data(index25)$Subject, levels = ordering)


#hellinger transform the data
otus = data.frame(otu_table(index25))     
library(vegan)
otus.h = decostand(otus, "hellinger")  

#Do PCOA
#nmds on bray curtis distance of hellinger transformation
h.euc = vegdist(otus.h, "bray")
h.nmds = cmdscale(h.euc, eig=TRUE, k=10)
h.map = data.frame(sample_data(index25), scores(h.nmds))
h.map$method = "Hellinger"
h.eig = 100*(h.nmds$eig/sum(h.nmds$eig))
head(h.eig)
## [1] 23.478257 10.985589  9.710830  7.991654  6.795043  5.868218
#make a screeplot
evals <- h.nmds$eig
foo = data.frame(h.eig, as.numeric(1:length(h.eig)))
colnames(foo) = c("percent.variation", "rank")
p=ggplot(foo[1:10,], aes(rank, percent.variation)) + geom_point() + theme_bw()
p

1c. Figure 1: Samples segregate by tooth class along the first axis

#look at all the data as a function of tooth class
p1=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Class") + geom_point(size=2, alpha=0.1) + theme_bw() +theme(legend.position = "none") + theme(text = element_text(size=18), axis.text.x = element_text(angle=0, vjust=1))+coord_fixed(sqrt(evals[2] / evals[1]))
p.dat = p1$data

##### plot figure panels
#1a
fig1a=plot_ordination(index25, h.nmds, type="samples", color="Subject") + geom_point(alpha=0.4) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + scale_colour_brewer(palette="Dark2")+coord_fixed(sqrt(evals[2] / evals[1]))  + ggtitle("a)") + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")


#1b
fig1b=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Class")   + geom_point(alpha=0.4)  + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))  +coord_fixed(sqrt(evals[2] / evals[1])) + guides(color=guide_legend(title="Tooth Class")) + ggtitle("b)")  + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")

#1c
fig1c=plot_ordination(index25, h.nmds, type="samples", color="Tooth_Aspect")   + geom_point(alpha=0.4) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + scale_colour_brewer(palette="Set1") +coord_fixed(sqrt(evals[2] / evals[1]))+ guides(color=guide_legend(title="Tooth Aspect")) + ggtitle("c)")  + xlab("Axis 1 (23.48%)") + ylab("Axis 2 (10.99%)")

#1d
fig1d <- ggplot(p.dat, aes(Tooth_Class, Dim1, fill=Tooth_Class)) + geom_boxplot(notch=TRUE) + facet_wrap(~Tooth_Aspect) + theme_bw() + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) +  ylab("Axis 1 (23.48%)") + xlab("")+ guides(fill=guide_legend(title="Tooth Class")) + ggtitle("d)") 

grid.arrange(fig1a, fig1b, fig1c, fig1d, ncol=2)

#save figure 1
ggsave(grid.arrange(fig1a, fig1b, fig1c, fig1d, ncol=2), file="~/Dropbox/Figure1.eps",  width = 10, height = 8, units ="in",dpi = 300, device="eps")
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page
#how many comparisons are there in the boxplots
#buccal
b = subset(p.dat, Tooth_Aspect=="Buccal")
table(b$Tooth_Class)
## 
## Incisor   Molar 
##     849     849
#lingual
l = subset(p.dat, Tooth_Aspect=="Lingual")
table(l$Tooth_Class)
## 
## Incisor   Molar 
##     849     846

1d. Quantify the separation of samples along the axes based on tooth class and tooth aspect

p1.dat = fig1d$data

#how many incisors have negative scores
test1 = subset(p1.dat, Tooth_Class=="Incisor" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Class=="Incisor")
nrow(test1)/nrow(testb)
## [1] 0.6943463
#of incisors with + scores, many incisors have scores > than 0.25?
test2 = subset(p1.dat, Tooth_Class=="Incisor" & Dim1 > 0.25)
nrow(test2)/nrow(test1)
## [1] 0.259542
#how many molar communities have positive scores 
test3 = subset(p1.dat, Tooth_Class=="Molar" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Class=="Molar")
nrow(test3)/nrow(testb)
## [1] 0.2637168
#how many molars have scores > than 0.25
test4 = subset(p1.dat, Tooth_Class=="Molar" & Dim1 > 0.25)
testb = subset(p1.dat, Tooth_Class=="Molar" ) 
nrow(test4)/nrow(test3)
## [1] 0.006711409
## see how many lingual samples have scores less than 0.25 sicnce there seems to be an interaction with most lingual incisors falling below 0.25
test5 = subset(p1.dat, Tooth_Aspect=="Lingual" & Dim1 > 0)
testb = subset(p1.dat, Tooth_Aspect=="Lingual" ) 
nrow(test5)/nrow(testb)
## [1] 0.5451327

1e. Merge temporal replicates by subject so that we can test the nmds model using Adonis

#merge the temporal replicates within each subject to avoid temporal pseudoreplication        
subs <- levels(sample_data(index25)$Subject) 
holder <-vector('list',length(subs)) 


for(i in 1:length(subs)){ 
    subx = subset_samples(index25, Subject==subs[[i]])
    subh = merge_samples(subx, "Habitat")
    sample_data(subh)$Subject = subs[[i]]
    sample_data(subh)$Habitat = sample_names(subh)
    sample_names(subh) = paste0(sample_data(subh)$Subject, "_", sample_names(subh))
    holder[[i]] = subh
}

is = merge_phyloseq(holder[[1]], holder[[2]], holder[[3]], holder[[4]], holder[[5]], holder[[6]], holder[[7]])
is2 = prune_taxa(taxa_sums(is) > 0, is)
is2 = prune_samples(sample_sums(is2) > 0, is2)
is2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 90 taxa and 112 samples ]
## sample_data() Sample Data:       [ 112 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 90 taxa by 10 taxonomic ranks ]
#set up a new mapping file to correct what was lost during the merge
        foo = sample_names(is2)
        foo2 <- str_replace_all(foo, "([B,L])", replacement =c(".Buccal", ".Lingual"))
        foo2 <- str_replace_all(foo2,  "_", ".")
        map <- colsplit(foo2, "([.])", c("Subject", "Junk", "Tooth_Number", "Tooth_Aspect"))
        map = data.frame(sample_data(is2)$Habitat, map)
        colnames(map)[1] = "Habitat"
         
        #merge in the mapping file information
        dot = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")
        colnames(dot)[1] = "Habitat"
        
        #merge maps
        library(plyr)
        map2 = join(map, dot, by="Habitat")
        
        #make the sample names of map2 the same as the names in bs2 then replace the mapping file
        names = paste0(map2$Subject, "_", map2$Habitat)
        rownames(map2) = names
        sample_data(is2) = map2

1f. Use adonis to see how much variation each factor explains after adjusting for temporal replication

#library(phyloseq)
library(vegan)
otus = data.frame(otu_table(is2))
otus.h = decostand(otus, "hellinger")
otus.euc = vegdist(otus.h, "euclidean")
attach(sample_data(is2))
set.seed(12)
index.adonis <- adonis(otus.euc ~Subject*Tooth_Class*Tooth_Aspect, permutations=1000, strata = Subject) 
df <- format(data.frame(index.adonis$aov.tab), digits=2)
kable(df, format="markdown")
Df SumsOfSqs MeanSqs F.Model R2 Pr..F.
Subject 6 13.22 2.204 35.7 0.469 0.001
Tooth_Class 1 2.64 2.642 42.7 0.094 0.001
Tooth_Aspect 1 1.06 1.060 17.1 0.038 0.001
Subject:Tooth_Class 6 2.69 0.448 7.3 0.095 0.001
Subject:Tooth_Aspect 6 1.21 0.202 3.3 0.043 0.001
Tooth_Class:Tooth_Aspect 1 1.22 1.216 19.7 0.043 0.001
Subject:Tooth_Class:Tooth_Aspect 6 0.99 0.165 2.7 0.035 0.004
Residuals 84 5.19 0.062 NA 0.184 NA
Total 111 28.22 NA NA 1.000 NA
#write.csv(df, file="~/Dropbox/Figures_20170617/Revised/FinalFigures/TableS3.csv")
detach(sample_data(is2))

1g. Check to see if sequencing depth explains these differences between groups

#do we see a difference in sequencing depth by tooth location, no we don't for most subjects 
df = data.frame(sample_sums(is2), sample_data(is2))
colnames(df)[1] = "abundance"

#does sequencing depth vary by tooth class on lingual surfaces?
ling= subset(df, Tooth_Aspect=="Lingual")
wilcox.test(abundance~Tooth_Class, data=ling, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test
## 
## data:  abundance by Tooth_Class
## W = 324, p-value = 0.271
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -219138   58331
## sample estimates:
## difference in location 
##                 -73439
#whatabout buccal surfaces
buc= subset(df, Tooth_Aspect=="Buccal")
wilcox.test(abundance~Tooth_Class, data=buc, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test
## 
## data:  abundance by Tooth_Class
## W = 299, p-value = 0.1303
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -171951   26164
## sample estimates:
## difference in location 
##               -78530.5

1f. Does alpha diversity differ between the buccal and lingual tooth aspects in human health?

mi= subset_samples(supra, Tooth_Class %in% c("Incisor_Central", "Molar"))
mi = prune_samples(sample_sums(mi) > 0, mi)


meths = c("Shannon", "Chao1", "Simpson")
sites = levels(as.factor(sample_data(mi)$Habitat))
holder <-vector('list',length(sites)) 

for(i in length(sites))
  {
  # Estimate diversity and make data frame
  erDF <- estimate_richness(mi, split = TRUE, measures = meths)
  df <- data.frame(erDF, sample_data(mi))
  holder[[i]] <- df
}

df <- data.frame(do.call("rbind", holder)) 


#test to see whether alpha diversity varies based on tooth class
wilcox.test(Chao1~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Chao1 by Tooth_Class
## W = 1133700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -24.00003 -21.99993
## sample estimates:
## difference in location 
##              -22.99995
wilcox.test(Shannon~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Shannon by Tooth_Class
## W = 1686900, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.2663786 -0.2123560
## sample estimates:
## difference in location 
##             -0.2393575
wilcox.test(Simpson~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Simpson by Tooth_Class
## W = 1985700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.02320114 -0.01541780
## sample estimates:
## difference in location 
##            -0.01931219

1g. is this true for clinic samples only?

  • subjects often report that it is difficult to collect lingual samples
  • we want to know whether this is true of the clinic samples only
  • we see that it isn’t so the differences that we are seeing are likely not a result of differences in alpha diversity
mi= subset_samples(mi, Protocol=="Clinic")
mi = prune_samples(sample_sums(mi) > 0, mi)
mi
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 984 samples ]
## sample_data() Sample Data:       [ 984 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
meths = c("Shannon", "Chao1", "Simpson")
sites = levels(as.factor(sample_data(mi)$Habitat))
holder <-vector('list',length(sites)) 

for(i in length(sites))
  {
  # Estimate diversity and make data frame
  erDF <- estimate_richness(mi, split = TRUE, measures = meths)
  df <- data.frame(erDF, sample_data(mi))
  holder[[i]] <- df
}

df <- data.frame(do.call("rbind", holder)) 


#test to see whether there are two class differences
#test to see whether alpha diversity varies based on tooth class
wilcox.test(Chao1~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Chao1 by Tooth_Class
## W = 58948, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -21.99995 -15.99996
## sample estimates:
## difference in location 
##              -18.99996
wilcox.test(Shannon~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Shannon by Tooth_Class
## W = 82205, p-value = 1.015e-09
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.2919400 -0.1525899
## sample estimates:
## difference in location 
##             -0.2219933
wilcox.test(Simpson~Tooth_Class, data=df, exact=TRUE, conf.int=TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Simpson by Tooth_Class
## W = 90641, p-value = 4.082e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -0.03257463 -0.01132578
## sample estimates:
## difference in location 
##             -0.0217224

2.1. Figure 2: Test to see if this variation is consistent with an ecological gradient across all tooth classes

  • in this case, we will sum temporal replicates to deal with the problem of pseudoreplication
#### 1. Compute the third-order orthogonal polynomial terms
#create validation datset #2 for time x space analysis, all teeth, healthy subjects
keep <- c("1-101", "1-102", "1-103", "1-104", "1-105", "1-106", "1-107")
FullMouth <- subset_samples(supra, Subject %in% keep & Protocol=="Clinic")
FullMouth <- prune_taxa(taxa_sums(FullMouth) > 0, FullMouth)
FullMouth
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 469 taxa and 1701 samples ]
## sample_data() Sample Data:       [ 1701 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 469 taxa by 10 taxonomic ranks ]
#filter the data
filtergroup = filterfun(kOverA(k=200, A=1)) #k = number of samples; A = abundance
        f1 = filter_taxa(FullMouth, filtergroup, prune=TRUE) 
        f1 = prune_taxa(taxa_sums(f1) > 0, f1)
        f1 = prune_samples(sample_sums(f1) > 0, f1)
        f1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 117 taxa and 1701 samples ]
## sample_data() Sample Data:       [ 1701 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 117 taxa by 10 taxonomic ranks ]
#fix the variable labeling for prettier plotting
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Incisor_Central)", "Incisor")
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Incisor_Lateral)", "Incisor")
sample_data(f1)$Tooth_Class <- str_replace_all(sample_data(f1)$Tooth_Class, "(Molar_Pre)", "Premolar")

#hellinger transform the data
library(vegan)
otus = data.frame(otu_table(f1))
otus.h = decostand(otus, "hellinger")
map = sample_data(f1)
map$x = as.numeric(as.character(map$x))
map$y = as.numeric(as.character(map$y))

#center the coordinates
xygrid = cbind(map$x, map$y)
xygrid.c <- scale(xygrid, scale=FALSE)

### Compute the third-order orthogonal polynomial function using centered geographic coordinates
poly.xy3 <- poly(xygrid.c, degree = 3, raw=FALSE) #, coefs=TRUE) 
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)

2.2. Perform trend surface analysis

  • use the 9 polynomial terms - note: this is comparable to what we’ve already done, but instead of using VST transformed data we use the hellinger transformed data as the reviewer suggested
  • use the hellinger transformed data which doesn’t weight rarer or under-sampled species
  • we see that the pattern is the same, but there are more outliers with this transformation than with the other
  • we see that the molars and incisors tend to separate out on the first main axis
library(ade4)
## 
## Attaching package: 'ade4'
## The following object is masked from 'package:GenomicRanges':
## 
##     score
## The following object is masked from 'package:IRanges':
## 
##     score
## The following object is masked from 'package:BiocGenerics':
## 
##     score
#perform the RDA on the hellinger transformed data; extract the coordinates 
rld.pca <- dudi.pca(otus.h , center=TRUE, scale=TRUE, scannf=FALSE, nf=10)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 6)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)

# how much of the variance does the trend surface model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
## [1] 3.449022
#look at the eigenvalues
rld.xy3$eig
## [1] 1.89291876 1.01339151 0.59150811 0.18315651 0.11501567 0.08787059
## [7] 0.05979274 0.04891794 0.04278420
#look at the screeplot
screeplot(rld.xy3) #we should look at the first 3 axes

#what is the percent of ewxplained variance
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
## [1] 46.908346 25.112815 14.658139  4.538794  2.850199  2.177518  1.481722
## [8]  1.212234  1.060234
#force x, y to numeric
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))

### Force variables to order in ggplot
order=1:32
xy3.df$Tooth_Number <- factor(xy3.df$Tooth_Number, as.character(order))
xy3.df$tclass = xy3.df$Tooth_Class
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Central", "Incisor")
xy3.df$tclass = str_replace_all(xy3.df$tclass, "Incisor_Lateral", "Incisor")

2.3 it looks like the tooth classes separate along axis 1 in some subjects; let’s look at variation of tooth classes against the spatial coordinates

  • we see that the molars and incisors tend to separate along axis 1
  • we also see that this pattern holds true in the case where we are using hellinger transformed data; these findings are consistent with other transformations, including the variance stabilized transformation as well as the rank-abundance transformation
#plot axis 1
cbPalette <- c("grey76", "Salmon", "#00BFC4", "grey76")
a1a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis1, fill=tclass))  + theme_bw() + ylab("Axis 1") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth")  + ylim(-8, 8)  + geom_boxplot()+theme(legend.position="none")+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))

#plot axis 2
a2a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis2, fill=tclass))  + theme_bw() + ylab("Axis 2") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth")  + ylim(-8, 8) +theme(legend.position="none") + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))


#plot axis 3
a3a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis3, fill=tclass))  + theme_bw() + ylab("Axis 3") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth")  + ylim(-8, 8) +theme(legend.position="none")  + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))

#plot axis 4
a4a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis4, fill=tclass))  + theme_bw() + ylab("Axis 4") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth")  + ylim(-8, 8) +theme(legend.position="none") + geom_boxplot()+ scale_fill_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))

grid.arrange(a1a, a2a, a3a, a4a, ncol=2)
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
## Warning: Removed 26 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

#how many comparisons are there in the boxplots
#buccal
b = subset(xy3.df, Tooth_Aspect=="Buccal")
table(b$Tooth_Class)
## 
##   Canine  Incisor    Molar Premolar 
##      126      249      249      227
#lingual
l = subset(xy3.df, Tooth_Aspect=="Lingual")
table(l$Tooth_Class)
## 
##   Canine  Incisor    Molar Premolar 
##      125      250      248      227
2.4. Generate Figure 2a.
  • Plot axis 1 against axis 2
  • Also look at boxplots of tooth class
a1a = ggplot(xy3.df, aes(as.factor(Tooth_Number), Axis1, fill=tclass))  + theme_bw() + ylab("Axis 1") + facet_wrap(~Tooth_Aspect, ncol=1)+ xlab("Tooth") + scale_x_discrete(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5)) + xlab("Tooth")  + ylim(-8, 8)  + theme(plot.title = element_text(size=12)) + geom_boxplot()+theme(legend.position="none")+scale_fill_manual(values=cbPalette) 
a1a
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

c1 = ggplot(xy3.df, aes(Axis2, Axis1, colour=Tooth_Class))  + theme_bw()  + geom_point() + guides(color=guide_legend(title="Tooth Class")) + theme(legend.position="none")+ scale_color_manual(values=cbPalette)+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))
c1

ggsave(a1a, file="~/Dropbox/Figure2a.eps",  width = 3.5, height = 6, units ="in",dpi = 300, device="eps")
## Warning: Removed 25 rows containing non-finite values (stat_boxplot).
#ggsave(a1a, file="~/Desktop/Figure2b.png",  width = 3.5, height = 6, units ="in",dpi = 300, device="png")

2.5. get the adjusted R^2 value for the complete model and then perform forward-selection using the Blanchet et al (2008) double stopping criterion

  • ok so we think we see that there is a trend; now we want to perform forward selection on the polynomial terms
  • then identify the spatial terms that explain a significant fraction of the variance in community data
#third order non-orthogonal polynomial
fullmouth.trendnon.rda <- rda(otus.h ~., data=as.data.frame(poly.xy3))
(R2adj.poly <- RsquareAdj(fullmouth.trendnon.rda)$adj.r.squared)
## [1] 0.03295887
# do the forward selection
library(packfor)
## packfor: R Package for Forward Selection (Canoco Manual p.49)
## version0.0-8
fullmouth.trend.fwd <- forward.sel(otus.h, poly.xy3, adjR2thresh=R2adj.poly)
## Testing variable 1
## Testing variable 2
## Testing variable 3
## Testing variable 4
## Testing variable 5
## Testing variable 6
## Testing variable 7
## Procedure stopped (R2more criteria): variable 7 explains only 0.000938 of the variance.
kable(fullmouth.trend.fwd) 
variables order R2 R2Cum AdjR2Cum F pval
X^2 2 0.0116277 0.0116277 0.0110460 19.987961 0.001
Y^2 7 0.0089643 0.0205920 0.0194384 15.541338 0.001
Y 4 0.0079510 0.0285430 0.0268256 13.889299 0.001
Y^3 9 0.0032835 0.0318266 0.0295431 5.751951 0.001
X^2Y 6 0.0027969 0.0346235 0.0317757 4.910795 0.001
X^3 3 0.0012532 0.0358767 0.0324618 2.201905 0.012
#write.csv(fullmouth.trend.fwd, file="~/Dropbox/Figures_20170617/Revised/FinalFigures/TableS5.csv")

#perform a new RDA using the significant terms
fullmouth.trend.rda2 <- rda(otus.h~., data=as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]])


#what is the R2 of the new model
(R2adj.fwd <- RsquareAdj(fullmouth.trend.rda2)$adj.r.squared)
## [1] 0.03246182
set.seed(990)
#test the significance of the model 
fm.aov = anova.cca(fullmouth.trend.rda2, step=1000)
fm.aov
## Permutation test for rda under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = otus.h ~ `X^2` + `Y^2` + Y + `Y^3` + `X^2Y` + `X^3`, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
##            Df Variance      F Pr(>F)    
## Model       6  0.01555 10.506  0.001 ***
## Residual 1694  0.41791                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test the significance of the individual axes
set.seed(88)
fm.aov.axes = anova.cca(fullmouth.trend.rda2, step=1000, by="axis")
fm.aov.axes
## Permutation test for rda under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = otus.h ~ `X^2` + `Y^2` + Y + `Y^3` + `X^2Y` + `X^3`, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
##            Df Variance       F Pr(>F)    
## RDA1        1  0.00710 28.7741  0.001 ***
## RDA2        1  0.00465 18.8418  0.001 ***
## RDA3        1  0.00222  8.9942  0.001 ***
## RDA4        1  0.00091  3.6707  0.006 ** 
## RDA5        1  0.00042  1.6839  0.158    
## RDA6        1  0.00026  1.0718  0.355    
## Residual 1694  0.41791                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the 5 significant spatial structures (i.e., canonical axes)
#note that scaling to 1 displays the spatial model (i.e., the linear combo of spatial variables while preserving the Euclidean distance among sites)
fullmouth.trend.fit <- scores(fullmouth.trend.rda2, choices=c(1, 2, 3, 4), display="lc", scaling=1)
fit = data.frame(fullmouth.trend.fit, sample_data(f1))
fit$x  = as.numeric(as.character(fit$x))
fit$y  = as.numeric(as.character(fit$y))

library(RColorBrewer)
myPalette = colorRampPalette(brewer.pal(11, "RdBu"), space="Lab")

#plot the first axis
p1 = ggplot(fit, aes(x, y, color=RDA1)) +ggtitle("RDA1") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

#plot the second axis
p2 = ggplot(fit, aes(x, y, color=RDA2)) +ggtitle("RDA2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))


#plot the third axis
p3 = ggplot(fit, aes(x, y, color=RDA3)) +ggtitle("RDA3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))

#plot the 4th axis
p4 = ggplot(fit, aes(x, y, color=RDA4)) +ggtitle("RDA4") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ theme(plot.title = element_text(size=12))


#visualize the significant axes fit
grid.arrange(p1, p2, p3,p4, ncol=4)

#ggsave(grid.arrange(p1, p2, p3,p4,ncol=2), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS7a.png",  width = 10, height = 9, units ="in",dpi = 300, device="png") 

2.6. Plot the significant polynomial terms

  • by looking at these, we can gain additional insight into the spatial variables that explain variation in community structure
sig = fullmouth.trend.fwd$variables
sig = str_replace_all(sig, "[(^)]", ".")
xy.sig = data.frame(poly.xy3)[sig]

#subset on the significant polynomial terms
df = as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]]
df = data.frame(df, sample_data(f1))
df$x = as.numeric(as.character(df$x))
df$y = as.numeric(as.character(df$y))

fullmouth.trend.fwd$F.Stat = round(fullmouth.trend.fwd$F, 2)


#plot the first axis
t1 = ggplot(df, aes(x, y, color=X.2)) +ggtitle("X^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

t2 = ggplot(df, aes(x, y, color=Y.2)) +ggtitle("Y^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

t3 = ggplot(df, aes(x, y, color=Y)) +ggtitle("Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

t4 = ggplot(df, aes(x, y, color=Y.3)) +ggtitle("Y^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

t5 = ggplot(df, aes(x, y, color=X.2Y)) +ggtitle("X^2*Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

t6 = ggplot(df, aes(x, y, color=X.3)) +ggtitle("X^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + theme(plot.title = element_text(size=12))

grid.arrange(t1, t2, t3,t4, t5, t6, ncol=3)

#ggsave(grid.arrange(t1, t2, t3,t4, t5, t6, ncol=2), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS7b.png",  width = 10, height = 9, units ="in",dpi = 300, device="png") 

Figure 2. How do we make sense of the gradient? Let’s look at the distribution of taxa across sites

2.8. Figure 2b: Make a between-taxa correlation matrix plot to highlight what the BCA reveals

#merge the temporal replicates within each subject to avoid temporal pseudoreplication        
subs <- levels(sample_data(f1)$Subject) 
holder <-vector('list',length(subs)) 


for(i in 1:length(subs)){ 
    subx = subset_samples(f1, Subject==subs[[i]])
    subh = merge_samples(subx, "Habitat")
    sample_data(subh)$Subject = subs[[i]]
    sample_data(subh)$Habitat = sample_names(subh)
    sample_names(subh) = paste0(sample_data(subh)$Subject, "_", sample_names(subh))
    holder[[i]] = subh
}

is = merge_phyloseq(holder[[1]], holder[[2]], holder[[3]], holder[[4]], holder[[5]], holder[[6]], holder[[7]])
is2 = prune_taxa(taxa_sums(is) > 0, is)
is2 = prune_samples(sample_sums(is2) > 0, is2)
is2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 117 taxa and 384 samples ]
## sample_data() Sample Data:       [ 384 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 117 taxa by 10 taxonomic ranks ]
#set up a new mapping file to correct what was lost during the merge
        foo = sample_names(is2)
        foo2 <- str_replace_all(foo, "([B,L])", replacement =c(".Buccal", ".Lingual"))
        foo2 <- str_replace_all(foo2,  "_", ".")
        map <- colsplit(foo2, "([.])", c("Subject", "Junk", "Tooth_Number", "Tooth_Aspect"))
        map = data.frame(sample_data(is2)$Habitat, map)
        colnames(map)[1] = "Habitat"
         
        #merge in the mapping file information
        dot = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")
        colnames(dot)[1] = "Habitat"
        
        #merge maps
        library(plyr)
        map2 = join(map, dot, by="Habitat")
        
        #make the sample names of map2 the same as the names in bs2 then replace the mapping file
        names = paste0(map2$Subject, "_", map2$Habitat)
        rownames(map2) = names
        sample_data(is2) = map2


#subset on the most abundant taxa for ease of visualization
library(ggcorrplot)
sub = names(sort(taxa_sums(is2), TRUE)[1:70]) 
sub2 = prune_taxa(sub, is2)

#stabilize the variance
sub2_VST = sub2
otu_table(sub2_VST) <- otu_table(sub2_VST) + 1
sub2.ds = phyloseq_to_deseq2(sub2_VST, ~Tooth_Class)
## converting counts to integer mode
#variance stabilizing transformation
sub2_vst <- varianceStabilizingTransformation(sub2.ds , blind=FALSE, fitType="local") 
counts_VST <- otu_table(as.matrix(assay(sub2_vst)), taxa_are_rows=TRUE)
otu_table(sub2_VST) <- counts_VST 


#change the names
tax = data.frame(tax_table(sub2))
names = tax$Highest.Rank
otus= data.frame(otu_table(sub2))

#generate the corrrelation matrix and assign the taxa names to rows and columns
cormat = round(cor(otus), 1)
dim(cormat)
## [1] 70 70
rownames(cormat) = names
colnames(cormat) = names

#generate pvalues
p.mat <- cor_pmat(otus)

p=ggcorrplot(cormat, hc.order = TRUE, type="full", hc.method = "average", lab_size = 6, tl.cex=6, sig.level=0005, colors = c("turquoise1", "white", "coral")) + theme(text = element_text(size=12), axis.text.x = element_text(angle=90, vjust=1)) +theme(legend.position="bottom")


#test figure
p

ggsave(p, file="~/Dropbox/Figure2b.eps",  width = 10, height = 8, units ="in",dpi = 300, device="eps")

Figure 2c: plot the distribution of select taxa that vary significantly across sites

keep = c("Corynebacterium durum", "Rothia dentocariosa", "Veillonella dispar str. 01", "Haemophilus parainfluenzae",  "Rothia aeria")


#rename taxa names with highest rank
tax = data.frame(tax_table(is2))
temp = is2
taxa_names(temp) = tax$Highest.Rank
sub1 = prune_taxa(keep, temp)

CS1.phys = sub1
  #get the data into data.frame
  CS1.df <- data.frame(tax_table(CS1.phys), t(otu_table(CS1.phys)),
                       taxa_names(CS1.phys))
  colnames(CS1.df)[395] <- "RSV"
  CS1.dfm <- melt(CS1.df, id.var=c("RSV", colnames(tax_table(CS1.phys))))
  
  #make a denovo mapping file to fix what was lost during merge
  map1 = read.csv("~/Desktop/Proctor_NatureComm/mytoothdot_coordinates_FullMouth.csv")

  colnames(map1)[1] <- "Habitat"
  
  fm = CS1.dfm$variable
  fm1 = str_replace_all(fm, "_S", ".S")
  fm2 = colsplit(fm1, "[(.)]", c("J1", "J2", "Habitat"))
  CS1.dfm$Habitat = fm2$Habitat
  
  CS1.dfm = join(CS1.dfm, map1, by="Habitat")
  CS1.dfm$Tooth_Number <- as.factor(CS1.dfm$Tooth_Number)
#re-create the subject variable
foo = CS1.dfm$variable 
foo2 = colsplit(foo, "[(_)]", c("Subject", "Site"))
foo2 = str_replace_all(foo2$Subject, "X", "")
CS1.dfm$Subject = foo2

CS1.dfm$value = CS1.dfm$value + 1

f=ggplot(CS1.dfm, aes(as.numeric(Tooth_Number), value, group=Highest.Rank, color=Highest.Rank))  + geom_smooth(se=TRUE) + theme_bw() + facet_wrap(~Tooth_Aspect, ncol=1)+ scale_y_continuous(trans='log2') + xlab("Tooth Number") + ylab("Log2 Abundance")  + guides(color=guide_legend(title="Taxon"))   + scale_x_continuous(breaks = seq(from=0, to=30, by=5), labels=seq(from=0, to=30, by=5))+theme(legend.position="none")
f
## `geom_smooth()` using method = 'loess'

ggsave(f, file="~/Dropbox/Figure2c.png",  width = 3.5, height = 6, units ="in",dpi = 300, device="png")  
## `geom_smooth()` using method = 'loess'
#note: i printed the p and f plots and put together the figure in powerpoint

4. Figure 3: Do communities vary across a gradient on the oral mucosa also (i.e., mucosal biogeography dataset)?

The ordination method presented so far demonstrate that the data conform to a spatial pattern, but none of these models explicitly model the variance as a function of geographic location. Moreover, we also wanted to know whether communities inhabiting the mucosal surfaces (e.g., buccal mucosa, alveolar mucosa, keratinized gingiva) also vary along an ecological gradient.

To examine the variation of OTUs across a spatial gradient, we performed another PCA-IV, but instead of constraining by a factor (e.g., Habitat), we constrained by a third order polynomial function of the geographic coordinates. - Constrain ordination of the PCA Correlation Matrix by the polynomial function - Polynomial function: Y ~ X + X^2 + X^3 + Y + XY + X^2Y + Y^2 + XY^2 + Y^3 - Maximize the variance explained by the polynomial terms - Sometimes called Trend Surface Analysis - We see that the first 4 axes explain a chunk of the variation

#convert the biogeography datatset to trelative abundance
bioh.otus = data.frame(otu_table(Biogeo2_phys15))
bioh.euc = vegdist(bioh.otus, "euclidean")
bioh.nmds = cmdscale(bioh.euc, eig=TRUE, k=10)
bioh.map = data.frame(sample_data(Biogeo2_phys15), scores(bioh.nmds))
h.map$method = "Hellinger"
h.eig = 100*(h.nmds$eig/sum(h.nmds$eig))
evals = h.nmds$eig
head(h.eig)
## [1] 23.478257 10.985589  9.710830  7.991654  6.795043  5.868218
df = data.frame(h.eig[1:10], 1:10)
colnames(df) = c("Percent.Variation", "rank")

p=ggplot(df, aes(rank, Percent.Variation)) + geom_point() + theme_bw()

p

3. Figure 3, Do PCA-IV on geographic coordinates to see if gradient is evident in mucosal communities

get environmental data

#coordinates
map <- sample_data(Biogeo2_VST)
coo <- cbind(map$x, map$y)
colnames(coo) <- c("x", "y")

#hellinger transform the data
otus = otu_table(Biogeo2_phys15)
otus.h = decostand(otus, "hellinger")

### Perform  PCA-IV with respect to a 3rd order polynomial of geographic coordinates
poly.xy3 <- poly(coo, degree = 3, raw=FALSE) #, coefs=TRUE) 
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)

library(ade4)
#totus <- data.frame(t(otu_table(Biogeo2_VST)))
rld.pca <- dudi.pca(otus.h, center=TRUE, scale=TRUE, scannf=FALSE, nf=10)
rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 6)
rld.xy3.df <- data.frame(rld.xy3$ls, map)
xy3.df <- data.frame(rld.xy3.df, poly.xy3)

# how much of the variance does this model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
rld.xy3.var
## [1] 10.48273
# get the eigen values
pca.scree <- data.frame(rank = 1:length(rld.xy3$eig))
Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
Explainedvariance
## [1] 49.350953 20.839256 16.951392  4.386053  2.489342  2.162378  1.558638
## [8]  1.243442  1.018547
#how much variance does the first axis explain?
(Explainedvariance[1]/100)*rld.xy3.var
## [1] 5.173326
#force x, y to numeric
xy3.df$x <- as.numeric(as.character(xy3.df$x))
xy3.df$y <- as.numeric(as.character(xy3.df$y))

### Force variables to order in ggplot
xy3.df$Tooth<- as.numeric(as.character(xy3.df$Tooth))


#order the sites for plotting
order <- c("Alveolar Mucosa   ", "Keratinized Gingiva   ", "Supragingival Tooth   ", "Buccal Mucosa   ")
xy3.df$Site.New <- factor(xy3.df$Site.New, as.character(order))
xy3.df$Class = str_replace_all(xy3.df$Class, "CentralIncisor", "Incisor")
xy3.df$Class = str_replace_all(xy3.df$Class, "LateralIncisor", "Incisor")
cbPalette <- c("grey76", "Salmon", "#00BFC4", "grey76")


#plot Axis 1 over teeth 
fig3 = ggplot(xy3.df, aes(Tooth, Axis1)) +geom_smooth(fill = "gray90", alpha=1) + geom_point(aes(colour=Class)) + scale_colour_manual(values=cbPalette) +facet_wrap(Aspect~Site.New, ncol=4, scales="free_y") + theme_bw() + ylab("Axis 1") + xlab("Tooth") + ggtitle("")+ theme(plot.title = element_text(size=15)) +scale_x_continuous(breaks = seq(from=0, to=35, by=5), labels=seq(from=0, to=35, by=5))


ggsave(fig3, file="~/Dropbox/Figure3.eps",  width = 10, height = 8, units ="in",dpi = 300, device="eps")  
## `geom_smooth()` using method = 'loess'
print(fig3)
## `geom_smooth()` using method = 'loess'

3.1. Do forward selection of the polynomial terms for the RDA

#third order non-orthogonal polynomial
fullmouth.trendnon.rda <- rda(otus.h ~., data=as.data.frame(poly.xy3))
(R2adj.poly <- RsquareAdj(fullmouth.trendnon.rda)$adj.r.squared)
## [1] 0.07541264
# do the forward selection
library(packfor)
fullmouth.trend.fwd <- forward.sel(otus.h, poly.xy3, adjR2thresh=R2adj.poly)
## Testing variable 1
## Testing variable 2
## Testing variable 3
## Testing variable 4
## Testing variable 5
## Testing variable 6
## Procedure stopped (adjR2thresh criteria) adjR2cum = 0.075776 with 6 variables (superior to 0.075413)
fullmouth.trend.fwd 
##   variables order          R2      R2Cum   AdjR2Cum         F  pval
## 1       Y^2     7 0.019646491 0.01964649 0.01783102 10.821714 0.001
## 2       X^2     2 0.027058934 0.04670543 0.04316815 15.299327 0.001
## 3         Y     4 0.015807555 0.06251298 0.05728536  9.071555 0.001
## 4       Y^3     9 0.008504480 0.07101746 0.06409766  4.916030 0.001
## 5      X^2Y     6 0.010867203 0.08188466 0.07332016  6.344324 0.001
## 6         X     1 0.004141985 0.08602665 0.07577648  2.424537 0.019
#perform a new RDA using the significant terms
fullmouth.trend.rda2 <- rda(otus.h~., data=as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]])


#what is the R2 of the new model
(R2adj.fwd <- RsquareAdj(fullmouth.trend.rda2)$adj.r.squared)
## [1] 0.07577648
#test the significance of the model 
set.seed(44)
bio.aov = anova.cca(fullmouth.trend.rda2, step=1000)
bio.aov
## Permutation test for rda under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = otus.h ~ `Y^2` + `X^2` + Y + `Y^3` + `X^2Y` + X, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
##           Df Variance      F Pr(>F)    
## Model      6  0.03557 8.3927  0.001 ***
## Residual 535  0.37786                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test the significance of the individual axes
set.seed(99)
axes.aov = anova.cca(fullmouth.trend.rda2, step=1000, by="axis")
axes.aov
## Permutation test for rda under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
## 
## Model: rda(formula = otus.h ~ `Y^2` + `X^2` + Y + `Y^3` + `X^2Y` + X, data = as.data.frame(poly.xy3)[, fullmouth.trend.fwd[, 2]])
##           Df Variance       F Pr(>F)    
## RDA1       1  0.01569 22.2202  0.001 ***
## RDA2       1  0.00898 12.7122  0.001 ***
## RDA3       1  0.00705  9.9820  0.001 ***
## RDA4       1  0.00184  2.6011  0.102    
## RDA5       1  0.00133  1.8866  0.185    
## RDA6       1  0.00067  0.9541  0.435    
## Residual 535  0.37786                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the 5 significant spatial structures (i.e., canonical axes)
#note that scaling to 1 displays the spatial model (i.e., the linear combo of spatial variables while preserving the Euclidean distance among sites)
fullmouth.trend.fit <- scores(fullmouth.trend.rda2, choices=c(1, 2, 3, 4, 5), display="lc", scaling=1)
fit = data.frame(fullmouth.trend.fit, sample_data(Biogeo2_phys15))
fit$x  = as.numeric(as.character(fit$x))
fit$y  = as.numeric(as.character(fit$y))

library(RColorBrewer)
myPalette = colorRampPalette(brewer.pal(11, "RdBu"), space="Lab")

#plot the first axis
p1 = ggplot(fit, aes(x, y, color=RDA1)) +ggtitle("RDA1") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw() + facet_wrap(~Site.New, ncol=4)

#plot the second axis
p2 = ggplot(fit, aes(x, y, color=RDA2)) +ggtitle("RDA2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)


#plot the third axis
p3 = ggplot(fit, aes(x, y, color=RDA3)) +ggtitle("RDA3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)



grid.arrange(p1, p2, p3, ncol=3)

3.2. Supplemental Figure S11: Plot the other significant axes for the mucosal biogeography dataset
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Alveolar Mucosa   ", "AM")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Keratinized Gingiva   ", "KG")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Supragingival Tooth   ", "Tooth")
xy3.df$Site.New = str_replace_all(xy3.df$Site.New, "Buccal Mucosa   ", "BM")
  
a2 =  ggplot(xy3.df, aes(Tooth, Axis2))  + theme_bw() + ylab("Axis 2") + facet_wrap(Aspect~Site.New, ncol=4, scales="free_y")+ xlab("Tooth") + xlab("Tooth")   +theme(legend.position="none") + theme(plot.title = element_text(size=8)) + geom_smooth() + geom_point()   + theme(text = element_text(size=10), axis.text.x = element_text(angle=00, vjust=1))

a3 =  ggplot(xy3.df, aes(Tooth, Axis3))  + theme_bw() + ylab("Axis 3") + facet_wrap(Aspect~Site.New, ncol=4, scales="free_y")+ xlab("Tooth") + xlab("Tooth")   +theme(legend.position="none") + theme(plot.title = element_text(size=8)) + geom_smooth() + geom_point()+ theme(text = element_text(size=10), axis.text.x = element_text(angle=00, vjust=1))

grid.arrange(a2, a3, ncol=2)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'

#ggsave(grid.arrange(a2, a3, ncol=1), file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS11a.png",  width = 6, height = 9, units ="in",dpi = 300, device="png")

3.3. Plot the significant polynomial terms

  • by looking at these, we can gain additional insight into the spatial variables that explain variation in community structure
sig = fullmouth.trend.fwd$variables
sig = str_replace_all(sig, "[(^)]", ".")
xy.sig = data.frame(poly.xy3)[sig]

#subset on the significant polynomial terms
df = as.data.frame(poly.xy3)[,fullmouth.trend.fwd[,2]]
df = data.frame(df, sample_data(Biogeo2_phys15))
df$x = as.numeric(as.character(df$x))
df$y = as.numeric(as.character(df$y))
dfm = melt(df, id.vars = colnames(sample_data(Biogeo1_phys)))
#plot the first axis
p1 = ggplot(df, aes(x, y, color=X.2)) +ggtitle("X^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

p2 = ggplot(df, aes(x, y, color=Y.2)) +ggtitle("Y^2") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

p3 = ggplot(df, aes(x, y, color=Y)) +ggtitle("Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

p4 = ggplot(df, aes(x, y, color=Y.3)) +ggtitle("Y^3") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

p5 = ggplot(df, aes(x, y, color=X.2Y)) +ggtitle("X^2*Y") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

p6 = ggplot(df, aes(x, y, color=X)) +ggtitle("X") + geom_point(aes(x=x, y=y)) + coord_fixed() +scale_x_continuous(limits=c(-1,1)) + geom_jitter(position=position_jitter(width=0.025, height = 0.025), size=3) + scale_color_gradientn(colours=myPalette(100)) + theme_bw()+ facet_wrap(~Site.New, ncol=4)+theme(legend.position = "none")

grid.arrange(p1, p2, p3,p4, p5, p6, ncol=2)

#ggsave(grid.arrange(p1, p2, p3,p4, p5, p6, ncol=2),file="~/Dropbox/Figures_20170617/Revised/FinalFigures/FigureS11b.png",  width = 10, height = 9, units ="in",dpi = 300, device="png")

5.1. Prelude to Figure 5, Does low salivary flow impact the spatial organization of taxa?

First, let’s take a look at the differences between the salivary flow rates of the control and SS populations. We’ll look at the Unstimulated Whole Saliva Flow Rate (UWS-FR) as well as the stimulated whole salivary flow rate (SWS-FR).

  • Import the data from redcap then merge with the study-specific identifiers
  • Add an aim variable to the dataset
  • To avoid treating repeated measures as independent samples for the aim 1 controls, we’ll get the mean of the flow rates for each subject and then perform Welch’s two-sample t-test comparing flow rates between groups.
  • This analysis shows that both the UWS-FR and and the SWS-FR varies significantly between the two groups
  • We see that the UWS-FR within each subject is highly corrrelated with the SWS-FR within that same subject
  • We also note that one SS subject has a flow rate that’s comparable to controls
  • Note redcap 63 didn’t have an UWS-FR taken on day 1 because of a procedural deviation
  • Redcap 239 (aim 3) has a negative UWS-FR for d01; it is based on one measurement only
  • Redcap 248 (aim 3) is assumed to have a collection period of 5 minutes
  • Redcap 219 is missing d01 flow rates; UWS-FR is based on one measurement only
  • Redcap 303 SWS-FR is missing and so the value could not be determined
#read in the pids
data1 = read.csv("~/Desktop/Proctor_NatureComm/pids.csv")
redcap.id = levels(as.factor(data1$redcap_record_num))

#read in the flow rates
data2 = read.csv("/Users/dmap/Desktop/Proctor_NatureComm/flow_rates_20170710_v2.0.csv")

#merge the pids with the flow rates
data3 = join(data1, data2, "redcap_record_num")

#add aim variable
sa1 = c(17 , 18, 36 , 39 , 49 , 63 , 170)
data3$aim = ifelse(data3$redcap_record_num %in% sa1, "Control", "Sjogrens")
data3$dcoll_sws_flowrate = as.numeric(as.character(data3$dcoll_sws_flowrate))
data3$dcoll_uws_flowrate = as.numeric(as.character(data3$dcoll_uws_flowrate))

#look at it
head(data3)
##   redcap_record_num   pid  X        redcap_event_name dcoll_uws_flowrate
## 1                17 1-101  8 d00_screening_exam_arm_1                 NA
## 2                17 1-101 23                d01_arm_1          0.4270000
## 3                17 1-101 35                d08_arm_1          0.6791350
## 4                17 1-101 42                d15_arm_1          0.5094678
## 5                17 1-101 48                d22_arm_1          0.6074000
## 6                17 1-101 53                d29_arm_1          0.3364000
##   dcoll_sws_flowrate escr_uws_fr Day Notes     aim
## 1                 NA       0.611   0  None Control
## 2           7.207034        <NA>   1  None Control
## 3           6.735644        <NA>   8  None Control
## 4           7.284400        <NA>  15  None Control
## 5           7.228015        <NA>  22  None Control
## 6           4.458800        <NA>  29  None Control
#how correlated is UWS-FR with SWS-FR?
p2 =ggplot(data3, aes(dcoll_uws_flowrate, dcoll_sws_flowrate, color=aim)) + geom_point() + theme_bw() + ggtitle("Correlation UWS-FR: SWS-FR")
p2

cor.test(data3$dcoll_uws_flowrate, data3$dcoll_sws_flowrate, method="pearson", exact=TRUE,na.rm=TRUE)
## 
##  Pearson's product-moment correlation
## 
## data:  data3$dcoll_uws_flowrate and data3$dcoll_sws_flowrate
## t = 4.5063, df = 35, p-value = 7.057e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3507946 0.7773392
## sample estimates:
##       cor 
## 0.6059399
#now let's look at some boxplot
dfm = melt(data3, id.var=c("redcap_record_num", "pid",  "redcap_event_name",  "Notes", "Day", "aim"))
dfm = subset(dfm, value !="NA")
dfm = subset(dfm, value !="ND")
dfm$value = as.numeric(as.character(dfm$value))

head(dfm)
##   redcap_record_num   pid        redcap_event_name Notes Day     aim
## 1                17 1-101 d00_screening_exam_arm_1  None   0 Control
## 2                17 1-101                d01_arm_1  None   1 Control
## 3                17 1-101                d08_arm_1  None   8 Control
## 4                17 1-101                d15_arm_1  None  15 Control
## 5                17 1-101                d22_arm_1  None  22 Control
## 6                17 1-101                d29_arm_1  None  29 Control
##   variable value
## 1        X     8
## 2        X    23
## 3        X    35
## 4        X    42
## 5        X    48
## 6        X    53
#get the mean flow rates
library(doBy)

#make an uws-fr df
uws = subset(dfm, variable %in% c("dcoll_uws_flowrate", "escr_uws_fr"))
#make a sws-fr df
sws = subset(dfm, variable == "dcoll_sws_flowrate")

#get significance of between group means - UWS
a1_uws = subset(uws, aim=="Control")
a3_uws = subset(uws, aim=="Sjogrens")
t.test(a1_uws$value,a3_uws$value)
## 
##  Welch Two Sample t-test
## 
## data:  a1_uws$value and a3_uws$value
## t = 8.3234, df = 36.912, p-value = 5.403e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.3121027 0.5129693
## sample estimates:
## mean of x mean of y 
## 0.5201955 0.1076595
#get significance of between group means - UWS
a1_sws = subset(sws, aim=="Control")
a3_sws = subset(sws, aim=="Sjogrens")
t.test(a1_sws$value,a3_sws$value)
## 
##  Welch Two Sample t-test
## 
## data:  a1_sws$value and a3_sws$value
## t = 6.3965, df = 15.333, p-value = 1.083e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.783452 5.557657
## sample estimates:
## mean of x mean of y 
##  6.131839  1.961285
#generate the average subject value for the flow rates in aim 1 since we have repeated measures and don't want those to be treated as if they were independent in the t.test
uws_df = summaryBy(value~pid+redcap_record_num, data=uws, FUN=mean, na.rm=TRUE)
sws_df = summaryBy(value~pid+redcap_record_num, data=sws, FUN=mean, na.rm=TRUE)

#merge the mean uws-fr, sws-fr, pid and microbial data
frs = join (uws_df, sws_df, by=c("pid", "redcap_record_num"))

colnames(frs)[1] = c("Subject")
colnames(frs)[3:4] = c("UWS_FR", "SWS_FR")

### Merge the flow rate data with the microbial data
flow <- subset_samples(supra, Day=="1" & Protocol=="Clinic")
flow # 480 taxa and 923 samples
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 923 samples ]
## sample_data() Sample Data:       [ 923 samples by 30 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
flow.map <- data.frame(sample_data(flow))


#merge the sequencing data with the clinical data
flow.map2 = join(flow.map, frs, "Subject")

5.2. Bring in the clinical data - in this section, we bring in caries

  • note that we examined the occlusal surfaces as well, but those data are not represented here since we only sampled the buccal and lingual surfaces of teeth
  • We see that in general the caries experience of SS subjects is widespread whereas for the control subjects it’s limited to the molars and incisors
  • Important to note that these data only represent the smooth surfaces!!
#read in the caries data and then subset on the redcap ids that are included in the dataset
data = read.csv("~/Desktop/Proctor_NatureComm/data_caries_combined_v1.2_20170328.csv")
colnames(data)[3] = "redcap_record_num"
data1 = subset(data, redcap_record_num %in% redcap.id)

#subset on buccal and lingual (excluding the occlusal surfaces as well as the mesial- and distal- sites) and get rid of wisdom teeth
bl = subset(data1, surface %in% c("B", "L"))
nowise = c(2:15, 18:31)
bl = subset(bl, tooth %in% nowise)

#convert disease to 1, sound to 0
bl$binary = ifelse(bl$description =="SOUND", 0, 1)
bl$binary = as.factor(as.numeric(bl$binary))

#count the number of decayed missing teeth per subject
bl.sum = summaryBy(binary~redcap_record_num, data=bl, FUN=sum)
bl.sum$prop = bl.sum/28
colnames(bl)[3]= "redcap_record_num"
colnames(bl)[8]= "Tooth_Number"
colnames(bl)[9]= "Tooth_Aspect"
bl$Tooth_Aspect = str_replace_all(bl$Tooth_Aspect, "([B,L])", replacement =c("Buccal", "Lingual"))
bl = bl[,3:11]


#join the caries data with the microbial data
total3 = join(flow.map2, bl, b=c("redcap_record_num", "Tooth_Number", "Tooth_Aspect"))
total3$x <- as.numeric(as.character(total3$x))
total3$y <- as.numeric(as.character(total3$y))
rownames(total3) = total3[,1]
total3 = sample_data(total3)

# relabel the subjects so they will correspond to the way they are labeled in figure 5
#relabel controls that have symetrical gradients
total3$Subject = str_replace_all(total3$Subject, "1-101", "Control 01")
total3$Subject = str_replace_all(total3$Subject, "1-103", "Control 02") 
total3$Subject = str_replace_all(total3$Subject, "1-104", "Control 03") 
total3$Subject = str_replace_all(total3$Subject, "1-106", "Control 04") 
total3$Subject = str_replace_all(total3$Subject, "1-107", "Control 05") 

#relabel the controls that have flat maxillary lines
total3$Subject = str_replace_all(total3$Subject, "1-102", "Control 06")
total3$Subject = str_replace_all(total3$Subject, "1-105", "Control 07") 


#relabels sjogrens that have symetrical curves
total3$Subject = str_replace_all(total3$Subject, "3-305", "Sjogrens 01") 
total3$Subject = str_replace_all(total3$Subject, "3-302", "Sjogrens 02") 
total3$Subject = str_replace_all(total3$Subject, "3-306", "Sjogrens 03") 


#relabel sjogren's that have flat lines
total3$Subject = str_replace_all(total3$Subject, "3-304", "Sjogrens 04") 
total3$Subject = str_replace_all(total3$Subject, "3-308", "Sjogrens 05") 
total3$Subject = str_replace_all(total3$Subject, "3-310", "Sjogrens 06") 
total3$Subject = str_replace_all(total3$Subject, "3-309", "Sjogrens 07") 


total3$Subject = str_replace_all(total3$Subject, "3-303", "Sjogrens 08") 
total3$Subject = str_replace_all(total3$Subject, "3-307", "Sjogrens 09") 

#relabel sjogrens that have exaggerated  lines

total3$Subject = str_replace_all(total3$Subject, "3-301", "Sjogrens 10") 


#make the total3 variable the new mapping file for the object flow
sample_data(flow) = total3

5.3. Figure 5, Does salivary flow impact the structure of bacterial communities?

  • all flow rates but the one for redcap 39 are from the ESE
  • flow rate for redcap 39 is from day 1 of sample collection since data were missing for the ESE
#how many samples and subjects are there?
flow 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 923 samples ]
## sample_data() Sample Data:       [ 923 samples by 39 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
levels(sample_data(flow)$Subject) #7 controls; 10 aim 3
## NULL
#get ridof subjects with missing flow rate for the cca
flows = subset_samples(flow, UWS_FR & SWS_FR != "NA")
levels(sample_data(flows)$Subject) #7 controls; lost subject 3-310 because of sws-fr
## NULL
#subset on taxa present in 10% of samples
filtergroup = filterfun(kOverA(k=75, A=1)) #k = number of samples; A = abundance
        flows = filter_taxa(flows, filtergroup, prune=TRUE) 
        flows = prune_taxa(taxa_sums(flows) > 0, flows)
        flows = prune_samples(sample_sums(flows) > 0, flows)
        flows
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 147 taxa and 825 samples ]
## sample_data() Sample Data:       [ 825 samples by 39 sample variables ]
## tax_table()   Taxonomy Table:    [ 147 taxa by 10 taxonomic ranks ]
#stabilize the variance
flows_VST = flows
otu_table(flows_VST) <- otu_table(flows_VST) + 1
flows.ds = phyloseq_to_deseq2(flows_VST, ~PoolName)
## converting counts to integer mode
#variance stabilizing transformation
flows_vst <- varianceStabilizingTransformation(flows.ds , blind=FALSE, fitType="local") 
counts_VST <- otu_table(as.matrix(assay(flows_vst)), taxa_are_rows=TRUE)
otu_table(flows_VST) <- counts_VST 


#set an analysis up for CCA
otus = t(otu_table(flows_VST))
map1 = data.frame(sample_data(flows_VST)$Tooth_Class, sample_data(flows_VST)$UWS_FR, sample_data(flows_VST)$SWS_FR, sample_data(flows_VST)$binary)
colnames(map1) = c("Class", "uws_fr", "sws_fr", "dmfs")
map1$Class = as.numeric(map1$Class)
subs = as.vector(sample_data(flows_VST)$Subject)
runs = as.vector(sample_data(flows_VST)$Pool_Name)

#for some reason this wworks on the command line but not in knitr
attach(map1)
flow.cca = vegan::cca(otus ~ map1$Class*map1$uws_fr*map1$sws_fr*map1$dmfs, sitenv=map1)

#run an anova on this
set.seed(100)
flow.aov = anova.cca(flow.cca, by="term", strata=subs)
flow.aov
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = otus ~ map1$Class * map1$uws_fr * map1$sws_fr * map1$dmfs, sitenv = map1)
##                                               Df ChiSquare        F Pr(>F)
## map1$Class                                     1  0.001240   4.2099  0.001
## map1$uws_fr                                    1  0.030191 102.5027  0.001
## map1$sws_fr                                    1  0.008321  28.2490  0.001
## map1$dmfs                                      1  0.002725   9.2517  0.271
## map1$Class:map1$uws_fr                         1  0.000473   1.6048  0.001
## map1$Class:map1$sws_fr                         1  0.000418   1.4183  0.004
## map1$uws_fr:map1$sws_fr                        1  0.023267  78.9949  0.001
## map1$Class:map1$dmfs                           1  0.000458   1.5557  0.247
## map1$uws_fr:map1$dmfs                          1  0.001513   5.1381  0.190
## map1$sws_fr:map1$dmfs                          1  0.001976   6.7094  0.106
## map1$Class:map1$uws_fr:map1$sws_fr             1  0.000377   1.2793  0.007
## map1$Class:map1$uws_fr:map1$dmfs               1  0.000342   1.1603  0.164
## map1$Class:map1$sws_fr:map1$dmfs               1  0.000551   1.8719  0.187
## map1$uws_fr:map1$sws_fr:map1$dmfs              1  0.001440   4.8885  0.807
## map1$Class:map1$uws_fr:map1$sws_fr:map1$dmfs   1  0.000424   1.4379  0.212
## Residual                                     809  0.238285                
##                                                 
## map1$Class                                   ***
## map1$uws_fr                                  ***
## map1$sws_fr                                  ***
## map1$dmfs                                       
## map1$Class:map1$uws_fr                       ***
## map1$Class:map1$sws_fr                       ** 
## map1$uws_fr:map1$sws_fr                      ***
## map1$Class:map1$dmfs                            
## map1$uws_fr:map1$dmfs                           
## map1$sws_fr:map1$dmfs                           
## map1$Class:map1$uws_fr:map1$sws_fr           ** 
## map1$Class:map1$uws_fr:map1$dmfs                
## map1$Class:map1$sws_fr:map1$dmfs                
## map1$uws_fr:map1$sws_fr:map1$dmfs               
## map1$Class:map1$uws_fr:map1$sws_fr:map1$dmfs    
## Residual                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#how much variance does the model explain
eig = flow.cca$CCA$eig
eigp = 100*(eig/sum(eig))
head(eigp)
##      CCA1      CCA2      CCA3      CCA4      CCA5      CCA6 
## 59.737865 15.292899 11.534283  3.606900  2.646815  2.153165
#how much does the first axis explain?
eigp[1]
##     CCA1 
## 59.73787
#hwat about the second?
eigp[2]
##    CCA2 
## 15.2929
#get the sample site scores and plot them
sites = data.frame(flow.cca$CCA$wa, sample_data(flows_VST))
sites$Aim = ifelse(sites$Aim=="SA1", "Control", "Sjogrens")


#convert the MFS score to factor
sites$binary = as.factor(as.numeric(sites$binary))

p1 = ggplot(sites, aes(CCA1, CCA2, color=as.factor(Aim))) + geom_point() + theme_bw() + guides(color=guide_legend(title="Health Status")) + ggtitle("a)")  + xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)") + theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))

p2 = ggplot(sites, aes(CCA1, CCA2, color=uws_fr)) + theme_bw() + guides(color=guide_legend(title="UWS-FR")) + ggtitle("b)") + scale_colour_gradientn(colours=c("red", "blue")) + labs(title = "b)" ) + xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1)) + geom_point()

p3 = ggplot(sites, aes(CCA1, CCA2, color=sws_fr))  + theme_bw() + guides(color=guide_legend(title="SWS-FR")) + ggtitle("c)") + scale_colour_gradientn(colours=c("red", "blue")) + labs(title = "c)" )+ xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))+ geom_point()

p4 = ggplot(sites, aes(CCA1, CCA2, color=as.factor(binary))) + theme_bw() + guides(color=guide_legend(title="MFS")) + ggtitle("d)")  + labs(title = "d)" )+ xlab("CCA1 (59.74%)") + ylab("CCA2 (15.29%)")+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))+ geom_point()+ scale_colour_manual(values = c("red", "blue"))

p5 = ggplot(sites, aes(uws_fr, CCA1, color=Aim)) + geom_jitter() +theme_bw() + ggtitle("e)") + ylab("CCA1 (59.74%)") +xlab("UWS-FR")+ guides(color=guide_legend(title="Health Status"))+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))

p6 = ggplot(sites, aes(sws_fr, CCA1, color=Aim)) + geom_jitter() +theme_bw() + ggtitle("f)")  +xlab("SWS-FR") + ylab("CCA1 (59.74%)") + guides(color=guide_legend(title="Health Status"))+ theme(text = element_text(size=10, family="Helvetica"), axis.text.x = element_text(angle=00, vjust=1))


fig5= arrangeGrob(p1, p2,  p3, p4, p5, p6, ncol=3)
ggsave(grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3), file="~/Dropbox/Figure4.eps",  width = 11, height = 8.5, units ="in",dpi = 300)

grid.arrange(p1, p2, p3, p4, p5, p6, ncol=3)

5.4. look at the distribution of SS samples along axis 1

ss = subset(sites, Aim=="Sjogrens")
### how many ss samples map to negative axis 1 scores
neg = subset(ss, CCA1 < 0)
ss.neg= subset(neg, Aim=="Sjogrens")
nrow(ss.neg)/nrow(ss)
## [1] 0.3265306
### how many ss samples map to positive axis 1 scores
pos = subset(ss, CCA1 > 0)
ss.pos= subset(pos, Aim="Sjogrens")
nrow(ss.pos)/nrow(ss)
## [1] 0.6734694
########## look at the distribution of control samples along axis 1
cont = subset(sites, Aim=="Control")
### how many ss samples map to negative axis 1 scores
neg = subset(cont, CCA1 < 0)
nrow(neg)/nrow(cont)
## [1] 0.8880208
### how many ss samples map to positive axis 1 scores
pos = subset(cont, CCA1 > 0)
nrow(pos)/nrow(cont)
## [1] 0.1119792

6. Figure 6, Does low salivary flow impact the spatial organization of taxa?

#how many samples and subjects are there?
flow 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 480 taxa and 923 samples ]
## sample_data() Sample Data:       [ 923 samples by 39 sample variables ]
## tax_table()   Taxonomy Table:    [ 480 taxa by 10 taxonomic ranks ]
levels(sample_data(flow)$Subject) #7 controls; 10 aim 3
## NULL
#stabilize the variance
flows_VST = flow
otu_table(flows_VST) <- otu_table(flows_VST) + 1
flows.ds = phyloseq_to_deseq2(flows_VST, ~PoolName)
## converting counts to integer mode
#variance stabilizing transformation
flows_vst <- varianceStabilizingTransformation(flows.ds , blind=FALSE, fitType="local") 
counts_VST <- otu_table(as.matrix(assay(flows_vst)), taxa_are_rows=TRUE)
otu_table(flows_VST) <- counts_VST 

#transform sample counts for the PCA-IV
library(ade4)
otus = data.frame(otu_table(flow))
otus.h = decostand(otus, "hellinger")
map <- sample_data(flow)
        coo <- cbind(map$x, map$y)
        colnames(coo) <- c("x", "y")
 
### Perform  PCA-IV with respect to a 3rd order polynomial of geographic coordinates
poly.xy3 <- poly(coo, degree = 3, raw=FALSE) #, coefs=TRUE) 
colnames(poly.xy3) <- c("X", "X^2", "X^3", "Y", "XY", "X^2Y", "Y^2", "XY^2", "Y^3")
        poly.xy3.df <- data.frame(poly.xy3, map$x, map$y)
        library(ade4)
        #totus <- t(otu_table(flows_VST))
        rld.pca <- dudi.pca(otus.h, center=TRUE, scale=TRUE, scannf=FALSE, nf=5)
        rld.xy3 <- pcaiv(rld.pca, poly.xy3, scannf = FALSE, nf = 2)
        rld.xy3.df <- data.frame(rld.xy3$ls, map)
        xy3.df <- data.frame(rld.xy3.df, poly.xy3)

# how much of the variance does this model explain?
rld.xy3.var <- sum(rld.xy3$eig)/sum(rld.pca$eig)*100
        rld.xy3.var
## [1] 1.55107
# get the eigen values
pca.scree <- data.frame(rank = 1:length(rld.xy3$eig))
        Explainedvariance = rld.xy3$eig/sum(rld.xy3$eig)*100
        Explainedvariance
## [1] 29.613705 19.335762 13.964370  8.497993  7.825626  6.704618  5.511524
## [8]  4.730248  3.816154
#organize the data for plotting
xy3.df$x <- as.numeric(as.character(xy3.df$x))
        xy3.df$y <- as.numeric(as.character(xy3.df$y))
        xy3.df$Tooth_Number <- as.numeric(as.character(xy3.df$Tooth_Number))
        #force the tooth to order
        ordering = 1:32
        xy3.df$interval <- factor(xy3.df$Tooth_Number, as.character(ordering))
        xy3.df$Jaw = ifelse(xy3.df$Jaw_Quadrant %in% c(1, 2), "Maxilla", "Mandible")
        xy3.df$Aim = ifelse(xy3.df$Aim=="SA1", "Control", "Sjogrens")
        upper <- 1:2
        lower <- 3:4
        upper.df <- subset(xy3.df, Jaw_Quadrant %in% upper)
        lower.df <- subset(xy3.df, Jaw_Quadrant %in% lower)

#replace subject labels for plotting
xy3.df$Subject = as.factor(xy3.df$Subject)


colnames(xy3.df)[41] = "MFS"
xy3.df$MFS = as.factor(as.numeric(xy3.df$MFS))

xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Molar_Pre", "Pre-molar")
xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Incisor_Central", "Central Incisor")
xy3.df$Tooth_Class = str_replace_all(xy3.df$Tooth_Class, "Incisor_Lateral", "Lateral Incisor")

#generate figure
fig5 = ggplot(xy3.df, aes(Tooth_Number, Axis1, color=UWS_FR)) +  geom_smooth(fill = "gray90", alpha=1) + facet_wrap(~Subject, ncol=6, scales="free") + theme_bw() + ylab("Axis 1")   + guides(fill=guide_legend(title="Health Status")) + xlab("Tooth Number") + geom_point(aes(shape=MFS), size=2) + ggtitle("")+ scale_colour_gradientn(colours=c("red", "blue")) 

fig5
## `geom_smooth()` using method = 'loess'

ggsave(fig5, file="~/Dropbox/Figure5.eps",  width = 11, height = 8.5, units ="in",dpi = 300, device="eps")  
## `geom_smooth()` using method = 'loess'
#save.image(file="Proctor_MainFigures_v28.0.Rdata")

Do molars and incisors differentiate for subjects with sjogren’s compared to controls?

  • do a wilcoxon rank sum test on axis 1 scores to see if sjogren’s samples differentiate along axis 1, as do control samples
library("ggpubr")
## Loading required package: magrittr
df  = fig5$data
df= subset(df, Tooth_Class %in% c("Central Incisor", "Molar"))
buccal = subset(df, Tooth_Aspect=="Buccal")

p=ggplot(df, aes(Tooth_Class, Axis1, color=Tooth_Class)) + geom_boxplot() + facet_wrap(~Subject) + theme_bw()+ theme(text = element_text(size=12), axis.text.x = element_text(angle=90, vjust=1))
p + stat_compare_means(method = "wilcox.test")